AArch64: Use consistent atomicrmw expansion for FP operations #103702
Conversation
@llvm/pr-subscribers-backend-aarch64

Author: Matt Arsenault (arsenm)

Changes

Use LLSC or cmpxchg in the same cases as for the unsupported integer operations. This required some fixups to the LLSC implementation to deal with the fp128 case.

Patch is 88.31 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/103702.diff

5 Files Affected:
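(Before the diff, a quick illustration; the function and value names here are made up for the sketch, not taken from the patch. The affected inputs are floating-point atomicrmw operations such as:

define float @fadd_f32(ptr %p, float %v) {
  %r = atomicrmw fadd ptr %p, float %v seq_cst
  ret float %r
}

Without LSE, this now expands to a single ldaxr/stlxr loop instead of a nested compare-exchange loop, as the updated NOLSE check lines in the test diff below show.)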
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 40d9fa4f2b494a..26f698898e487b 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -27061,9 +27061,6 @@ AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
unsigned Size = AI->getType()->getPrimitiveSizeInBits();
assert(Size <= 128 && "AtomicExpandPass should've handled larger sizes.");
- if (AI->isFloatingPointOperation())
- return AtomicExpansionKind::CmpXChg;
-
bool CanUseLSE128 = Subtarget->hasLSE128() && Size == 128 &&
(AI->getOperation() == AtomicRMWInst::Xchg ||
AI->getOperation() == AtomicRMWInst::Or ||
@@ -27073,7 +27070,8 @@ AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
// Nand is not supported in LSE.
// Leave 128 bits to LLSC or CmpXChg.
- if (AI->getOperation() != AtomicRMWInst::Nand && Size < 128) {
+ if (AI->getOperation() != AtomicRMWInst::Nand && Size < 128 &&
+ !AI->isFloatingPointOperation()) {
if (Subtarget->hasLSE())
return AtomicExpansionKind::None;
if (Subtarget->outlineAtomics()) {
@@ -27146,10 +27144,14 @@ Value *AArch64TargetLowering::emitLoadLinked(IRBuilderBase &Builder,
Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
- Lo = Builder.CreateZExt(Lo, ValueTy, "lo64");
- Hi = Builder.CreateZExt(Hi, ValueTy, "hi64");
- return Builder.CreateOr(
- Lo, Builder.CreateShl(Hi, ConstantInt::get(ValueTy, 64)), "val64");
+
+ auto *Int128Ty = Type::getInt128Ty(Builder.getContext());
+ Lo = Builder.CreateZExt(Lo, Int128Ty, "lo64");
+ Hi = Builder.CreateZExt(Hi, Int128Ty, "hi64");
+
+ Value *Or = Builder.CreateOr(
+ Lo, Builder.CreateShl(Hi, ConstantInt::get(Int128Ty, 64)), "val64");
+ return Builder.CreateBitCast(Or, ValueTy);
}
Type *Tys[] = { Addr->getType() };
@@ -27160,8 +27162,8 @@ Value *AArch64TargetLowering::emitLoadLinked(IRBuilderBase &Builder,
const DataLayout &DL = M->getDataLayout();
IntegerType *IntEltTy = Builder.getIntNTy(DL.getTypeSizeInBits(ValueTy));
CallInst *CI = Builder.CreateCall(Ldxr, Addr);
- CI->addParamAttr(
- 0, Attribute::get(Builder.getContext(), Attribute::ElementType, ValueTy));
+ CI->addParamAttr(0, Attribute::get(Builder.getContext(),
+ Attribute::ElementType, IntEltTy));
Value *Trunc = Builder.CreateTrunc(CI, IntEltTy);
return Builder.CreateBitCast(Trunc, ValueTy);
@@ -27187,9 +27189,13 @@ Value *AArch64TargetLowering::emitStoreConditional(IRBuilderBase &Builder,
IsRelease ? Intrinsic::aarch64_stlxp : Intrinsic::aarch64_stxp;
Function *Stxr = Intrinsic::getDeclaration(M, Int);
Type *Int64Ty = Type::getInt64Ty(M->getContext());
+ Type *Int128Ty = Type::getInt128Ty(M->getContext());
+
+ Value *CastVal = Builder.CreateBitCast(Val, Int128Ty);
- Value *Lo = Builder.CreateTrunc(Val, Int64Ty, "lo");
- Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 64), Int64Ty, "hi");
+ Value *Lo = Builder.CreateTrunc(CastVal, Int64Ty, "lo");
+ Value *Hi =
+ Builder.CreateTrunc(Builder.CreateLShr(CastVal, 64), Int64Ty, "hi");
return Builder.CreateCall(Stxr, {Lo, Hi, Addr});
}
diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll
index f95caf325b197c..2c6461097f7d9b 100644
--- a/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll
@@ -6,33 +6,17 @@ define half @test_atomicrmw_fadd_f16_seq_cst_align2(ptr %ptr, half %value) #0 {
; NOLSE-LABEL: test_atomicrmw_fadd_f16_seq_cst_align2:
; NOLSE: // %bb.0:
; NOLSE-NEXT: fcvt s1, h0
-; NOLSE-NEXT: ldr h0, [x0]
-; NOLSE-NEXT: b .LBB0_2
; NOLSE-NEXT: .LBB0_1: // %atomicrmw.start
-; NOLSE-NEXT: // in Loop: Header=BB0_2 Depth=1
-; NOLSE-NEXT: fmov s0, w10
-; NOLSE-NEXT: cmp w10, w9, uxth
-; NOLSE-NEXT: b.eq .LBB0_5
-; NOLSE-NEXT: .LBB0_2: // %atomicrmw.start
-; NOLSE-NEXT: // =>This Loop Header: Depth=1
-; NOLSE-NEXT: // Child Loop BB0_3 Depth 2
+; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1
+; NOLSE-NEXT: ldaxrh w8, [x0]
+; NOLSE-NEXT: fmov s0, w8
; NOLSE-NEXT: fcvt s2, h0
-; NOLSE-NEXT: fmov w9, s0
; NOLSE-NEXT: fadd s2, s2, s1
; NOLSE-NEXT: fcvt h2, s2
; NOLSE-NEXT: fmov w8, s2
-; NOLSE-NEXT: .LBB0_3: // %atomicrmw.start
-; NOLSE-NEXT: // Parent Loop BB0_2 Depth=1
-; NOLSE-NEXT: // => This Inner Loop Header: Depth=2
-; NOLSE-NEXT: ldaxrh w10, [x0]
-; NOLSE-NEXT: cmp w10, w9, uxth
-; NOLSE-NEXT: b.ne .LBB0_1
-; NOLSE-NEXT: // %bb.4: // %atomicrmw.start
-; NOLSE-NEXT: // in Loop: Header=BB0_3 Depth=2
-; NOLSE-NEXT: stlxrh wzr, w8, [x0]
-; NOLSE-NEXT: cbnz wzr, .LBB0_3
-; NOLSE-NEXT: b .LBB0_1
-; NOLSE-NEXT: .LBB0_5: // %atomicrmw.end
+; NOLSE-NEXT: stlxrh w9, w8, [x0]
+; NOLSE-NEXT: cbnz w9, .LBB0_1
+; NOLSE-NEXT: // %bb.2: // %atomicrmw.end
; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0
; NOLSE-NEXT: ret
;
@@ -63,33 +47,17 @@ define half @test_atomicrmw_fadd_f16_seq_cst_align4(ptr %ptr, half %value) #0 {
; NOLSE-LABEL: test_atomicrmw_fadd_f16_seq_cst_align4:
; NOLSE: // %bb.0:
; NOLSE-NEXT: fcvt s1, h0
-; NOLSE-NEXT: ldr h0, [x0]
-; NOLSE-NEXT: b .LBB1_2
; NOLSE-NEXT: .LBB1_1: // %atomicrmw.start
-; NOLSE-NEXT: // in Loop: Header=BB1_2 Depth=1
-; NOLSE-NEXT: fmov s0, w10
-; NOLSE-NEXT: cmp w10, w9, uxth
-; NOLSE-NEXT: b.eq .LBB1_5
-; NOLSE-NEXT: .LBB1_2: // %atomicrmw.start
-; NOLSE-NEXT: // =>This Loop Header: Depth=1
-; NOLSE-NEXT: // Child Loop BB1_3 Depth 2
+; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1
+; NOLSE-NEXT: ldaxrh w8, [x0]
+; NOLSE-NEXT: fmov s0, w8
; NOLSE-NEXT: fcvt s2, h0
-; NOLSE-NEXT: fmov w9, s0
; NOLSE-NEXT: fadd s2, s2, s1
; NOLSE-NEXT: fcvt h2, s2
; NOLSE-NEXT: fmov w8, s2
-; NOLSE-NEXT: .LBB1_3: // %atomicrmw.start
-; NOLSE-NEXT: // Parent Loop BB1_2 Depth=1
-; NOLSE-NEXT: // => This Inner Loop Header: Depth=2
-; NOLSE-NEXT: ldaxrh w10, [x0]
-; NOLSE-NEXT: cmp w10, w9, uxth
-; NOLSE-NEXT: b.ne .LBB1_1
-; NOLSE-NEXT: // %bb.4: // %atomicrmw.start
-; NOLSE-NEXT: // in Loop: Header=BB1_3 Depth=2
-; NOLSE-NEXT: stlxrh wzr, w8, [x0]
-; NOLSE-NEXT: cbnz wzr, .LBB1_3
-; NOLSE-NEXT: b .LBB1_1
-; NOLSE-NEXT: .LBB1_5: // %atomicrmw.end
+; NOLSE-NEXT: stlxrh w9, w8, [x0]
+; NOLSE-NEXT: cbnz w9, .LBB1_1
+; NOLSE-NEXT: // %bb.2: // %atomicrmw.end
; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0
; NOLSE-NEXT: ret
;
@@ -122,19 +90,12 @@ define bfloat @test_atomicrmw_fadd_bf16_seq_cst_align2(ptr %ptr, bfloat %value)
; NOLSE-NEXT: // kill: def $h0 killed $h0 def $s0
; NOLSE-NEXT: fmov w9, s0
; NOLSE-NEXT: mov w8, #32767 // =0x7fff
-; NOLSE-NEXT: ldr h0, [x0]
; NOLSE-NEXT: lsl w9, w9, #16
; NOLSE-NEXT: fmov s1, w9
-; NOLSE-NEXT: b .LBB2_2
; NOLSE-NEXT: .LBB2_1: // %atomicrmw.start
-; NOLSE-NEXT: // in Loop: Header=BB2_2 Depth=1
-; NOLSE-NEXT: fmov s0, w11
-; NOLSE-NEXT: cmp w11, w9, uxth
-; NOLSE-NEXT: b.eq .LBB2_5
-; NOLSE-NEXT: .LBB2_2: // %atomicrmw.start
-; NOLSE-NEXT: // =>This Loop Header: Depth=1
-; NOLSE-NEXT: // Child Loop BB2_3 Depth 2
-; NOLSE-NEXT: fmov w9, s0
+; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1
+; NOLSE-NEXT: ldaxrh w9, [x0]
+; NOLSE-NEXT: fmov s0, w9
; NOLSE-NEXT: lsl w9, w9, #16
; NOLSE-NEXT: fmov s2, w9
; NOLSE-NEXT: fadd s2, s2, s1
@@ -143,21 +104,9 @@ define bfloat @test_atomicrmw_fadd_bf16_seq_cst_align2(ptr %ptr, bfloat %value)
; NOLSE-NEXT: add w9, w9, w8
; NOLSE-NEXT: add w9, w10, w9
; NOLSE-NEXT: lsr w9, w9, #16
-; NOLSE-NEXT: fmov s2, w9
-; NOLSE-NEXT: fmov w9, s0
-; NOLSE-NEXT: fmov w10, s2
-; NOLSE-NEXT: .LBB2_3: // %atomicrmw.start
-; NOLSE-NEXT: // Parent Loop BB2_2 Depth=1
-; NOLSE-NEXT: // => This Inner Loop Header: Depth=2
-; NOLSE-NEXT: ldaxrh w11, [x0]
-; NOLSE-NEXT: cmp w11, w9, uxth
-; NOLSE-NEXT: b.ne .LBB2_1
-; NOLSE-NEXT: // %bb.4: // %atomicrmw.start
-; NOLSE-NEXT: // in Loop: Header=BB2_3 Depth=2
-; NOLSE-NEXT: stlxrh wzr, w10, [x0]
-; NOLSE-NEXT: cbnz wzr, .LBB2_3
-; NOLSE-NEXT: b .LBB2_1
-; NOLSE-NEXT: .LBB2_5: // %atomicrmw.end
+; NOLSE-NEXT: stlxrh w10, w9, [x0]
+; NOLSE-NEXT: cbnz w10, .LBB2_1
+; NOLSE-NEXT: // %bb.2: // %atomicrmw.end
; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0
; NOLSE-NEXT: ret
;
@@ -199,19 +148,12 @@ define bfloat @test_atomicrmw_fadd_bf16_seq_cst_align4(ptr %ptr, bfloat %value)
; NOLSE-NEXT: // kill: def $h0 killed $h0 def $s0
; NOLSE-NEXT: fmov w9, s0
; NOLSE-NEXT: mov w8, #32767 // =0x7fff
-; NOLSE-NEXT: ldr h0, [x0]
; NOLSE-NEXT: lsl w9, w9, #16
; NOLSE-NEXT: fmov s1, w9
-; NOLSE-NEXT: b .LBB3_2
; NOLSE-NEXT: .LBB3_1: // %atomicrmw.start
-; NOLSE-NEXT: // in Loop: Header=BB3_2 Depth=1
-; NOLSE-NEXT: fmov s0, w11
-; NOLSE-NEXT: cmp w11, w9, uxth
-; NOLSE-NEXT: b.eq .LBB3_5
-; NOLSE-NEXT: .LBB3_2: // %atomicrmw.start
-; NOLSE-NEXT: // =>This Loop Header: Depth=1
-; NOLSE-NEXT: // Child Loop BB3_3 Depth 2
-; NOLSE-NEXT: fmov w9, s0
+; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1
+; NOLSE-NEXT: ldaxrh w9, [x0]
+; NOLSE-NEXT: fmov s0, w9
; NOLSE-NEXT: lsl w9, w9, #16
; NOLSE-NEXT: fmov s2, w9
; NOLSE-NEXT: fadd s2, s2, s1
@@ -220,21 +162,9 @@ define bfloat @test_atomicrmw_fadd_bf16_seq_cst_align4(ptr %ptr, bfloat %value)
; NOLSE-NEXT: add w9, w9, w8
; NOLSE-NEXT: add w9, w10, w9
; NOLSE-NEXT: lsr w9, w9, #16
-; NOLSE-NEXT: fmov s2, w9
-; NOLSE-NEXT: fmov w9, s0
-; NOLSE-NEXT: fmov w10, s2
-; NOLSE-NEXT: .LBB3_3: // %atomicrmw.start
-; NOLSE-NEXT: // Parent Loop BB3_2 Depth=1
-; NOLSE-NEXT: // => This Inner Loop Header: Depth=2
-; NOLSE-NEXT: ldaxrh w11, [x0]
-; NOLSE-NEXT: cmp w11, w9, uxth
-; NOLSE-NEXT: b.ne .LBB3_1
-; NOLSE-NEXT: // %bb.4: // %atomicrmw.start
-; NOLSE-NEXT: // in Loop: Header=BB3_3 Depth=2
-; NOLSE-NEXT: stlxrh wzr, w10, [x0]
-; NOLSE-NEXT: cbnz wzr, .LBB3_3
-; NOLSE-NEXT: b .LBB3_1
-; NOLSE-NEXT: .LBB3_5: // %atomicrmw.end
+; NOLSE-NEXT: stlxrh w10, w9, [x0]
+; NOLSE-NEXT: cbnz w10, .LBB3_1
+; NOLSE-NEXT: // %bb.2: // %atomicrmw.end
; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0
; NOLSE-NEXT: ret
;
@@ -273,31 +203,15 @@ define bfloat @test_atomicrmw_fadd_bf16_seq_cst_align4(ptr %ptr, bfloat %value)
define float @test_atomicrmw_fadd_f32_seq_cst_align4(ptr %ptr, float %value) #0 {
; NOLSE-LABEL: test_atomicrmw_fadd_f32_seq_cst_align4:
; NOLSE: // %bb.0:
-; NOLSE-NEXT: ldr s1, [x0]
-; NOLSE-NEXT: b .LBB4_2
; NOLSE-NEXT: .LBB4_1: // %atomicrmw.start
-; NOLSE-NEXT: // in Loop: Header=BB4_2 Depth=1
-; NOLSE-NEXT: fmov s1, w10
-; NOLSE-NEXT: cmp w10, w9
-; NOLSE-NEXT: b.eq .LBB4_5
-; NOLSE-NEXT: .LBB4_2: // %atomicrmw.start
-; NOLSE-NEXT: // =>This Loop Header: Depth=1
-; NOLSE-NEXT: // Child Loop BB4_3 Depth 2
+; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1
+; NOLSE-NEXT: ldaxr w8, [x0]
+; NOLSE-NEXT: fmov s1, w8
; NOLSE-NEXT: fadd s2, s1, s0
-; NOLSE-NEXT: fmov w9, s1
; NOLSE-NEXT: fmov w8, s2
-; NOLSE-NEXT: .LBB4_3: // %atomicrmw.start
-; NOLSE-NEXT: // Parent Loop BB4_2 Depth=1
-; NOLSE-NEXT: // => This Inner Loop Header: Depth=2
-; NOLSE-NEXT: ldaxr w10, [x0]
-; NOLSE-NEXT: cmp w10, w9
-; NOLSE-NEXT: b.ne .LBB4_1
-; NOLSE-NEXT: // %bb.4: // %atomicrmw.start
-; NOLSE-NEXT: // in Loop: Header=BB4_3 Depth=2
-; NOLSE-NEXT: stlxr wzr, w8, [x0]
-; NOLSE-NEXT: cbnz wzr, .LBB4_3
-; NOLSE-NEXT: b .LBB4_1
-; NOLSE-NEXT: .LBB4_5: // %atomicrmw.end
+; NOLSE-NEXT: stlxr w9, w8, [x0]
+; NOLSE-NEXT: cbnz w9, .LBB4_1
+; NOLSE-NEXT: // %bb.2: // %atomicrmw.end
; NOLSE-NEXT: fmov s0, s1
; NOLSE-NEXT: ret
;
@@ -324,31 +238,15 @@ define float @test_atomicrmw_fadd_f32_seq_cst_align4(ptr %ptr, float %value) #0
define double @test_atomicrmw_fadd_f32_seq_cst_align8(ptr %ptr, double %value) #0 {
; NOLSE-LABEL: test_atomicrmw_fadd_f32_seq_cst_align8:
; NOLSE: // %bb.0:
-; NOLSE-NEXT: ldr d1, [x0]
-; NOLSE-NEXT: b .LBB5_2
; NOLSE-NEXT: .LBB5_1: // %atomicrmw.start
-; NOLSE-NEXT: // in Loop: Header=BB5_2 Depth=1
-; NOLSE-NEXT: fmov d1, x10
-; NOLSE-NEXT: cmp x10, x9
-; NOLSE-NEXT: b.eq .LBB5_5
-; NOLSE-NEXT: .LBB5_2: // %atomicrmw.start
-; NOLSE-NEXT: // =>This Loop Header: Depth=1
-; NOLSE-NEXT: // Child Loop BB5_3 Depth 2
+; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1
+; NOLSE-NEXT: ldaxr x8, [x0]
+; NOLSE-NEXT: fmov d1, x8
; NOLSE-NEXT: fadd d2, d1, d0
-; NOLSE-NEXT: fmov x9, d1
; NOLSE-NEXT: fmov x8, d2
-; NOLSE-NEXT: .LBB5_3: // %atomicrmw.start
-; NOLSE-NEXT: // Parent Loop BB5_2 Depth=1
-; NOLSE-NEXT: // => This Inner Loop Header: Depth=2
-; NOLSE-NEXT: ldaxr x10, [x0]
-; NOLSE-NEXT: cmp x10, x9
-; NOLSE-NEXT: b.ne .LBB5_1
-; NOLSE-NEXT: // %bb.4: // %atomicrmw.start
-; NOLSE-NEXT: // in Loop: Header=BB5_3 Depth=2
-; NOLSE-NEXT: stlxr wzr, x8, [x0]
-; NOLSE-NEXT: cbnz wzr, .LBB5_3
-; NOLSE-NEXT: b .LBB5_1
-; NOLSE-NEXT: .LBB5_5: // %atomicrmw.end
+; NOLSE-NEXT: stlxr w9, x8, [x0]
+; NOLSE-NEXT: cbnz w9, .LBB5_1
+; NOLSE-NEXT: // %bb.2: // %atomicrmw.end
; NOLSE-NEXT: fmov d0, d1
; NOLSE-NEXT: ret
;
@@ -375,54 +273,26 @@ define double @test_atomicrmw_fadd_f32_seq_cst_align8(ptr %ptr, double %value) #
define fp128 @test_atomicrmw_fadd_f32_seq_cst_align16(ptr %ptr, fp128 %value) #0 {
; NOLSE-LABEL: test_atomicrmw_fadd_f32_seq_cst_align16:
; NOLSE: // %bb.0:
-; NOLSE-NEXT: sub sp, sp, #96
-; NOLSE-NEXT: ldr q1, [x0]
-; NOLSE-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; NOLSE-NEXT: sub sp, sp, #80
+; NOLSE-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill
; NOLSE-NEXT: mov x19, x0
-; NOLSE-NEXT: str q0, [sp] // 16-byte Folded Spill
-; NOLSE-NEXT: b .LBB6_2
+; NOLSE-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
; NOLSE-NEXT: .LBB6_1: // %atomicrmw.start
-; NOLSE-NEXT: // in Loop: Header=BB6_2 Depth=1
-; NOLSE-NEXT: stp x12, x13, [sp, #32]
-; NOLSE-NEXT: cmp x13, x10
-; NOLSE-NEXT: ldr q1, [sp, #32]
-; NOLSE-NEXT: ccmp x12, x11, #0, eq
-; NOLSE-NEXT: b.eq .LBB6_6
-; NOLSE-NEXT: .LBB6_2: // %atomicrmw.start
-; NOLSE-NEXT: // =>This Loop Header: Depth=1
-; NOLSE-NEXT: // Child Loop BB6_3 Depth 2
-; NOLSE-NEXT: mov v0.16b, v1.16b
-; NOLSE-NEXT: str q1, [sp, #16] // 16-byte Folded Spill
-; NOLSE-NEXT: ldr q1, [sp] // 16-byte Folded Reload
+; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1
+; NOLSE-NEXT: ldaxp x8, x9, [x19]
+; NOLSE-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload
+; NOLSE-NEXT: stp x8, x9, [sp, #48]
+; NOLSE-NEXT: ldr q0, [sp, #48]
+; NOLSE-NEXT: str q0, [sp] // 16-byte Folded Spill
; NOLSE-NEXT: bl __addtf3
-; NOLSE-NEXT: str q0, [sp, #48]
-; NOLSE-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload
-; NOLSE-NEXT: ldp x9, x8, [sp, #48]
-; NOLSE-NEXT: str q0, [sp, #64]
-; NOLSE-NEXT: ldp x11, x10, [sp, #64]
-; NOLSE-NEXT: .LBB6_3: // %atomicrmw.start
-; NOLSE-NEXT: // Parent Loop BB6_2 Depth=1
-; NOLSE-NEXT: // => This Inner Loop Header: Depth=2
-; NOLSE-NEXT: ldaxp x12, x13, [x19]
-; NOLSE-NEXT: cmp x12, x11
-; NOLSE-NEXT: cset w14, ne
-; NOLSE-NEXT: cmp x13, x10
-; NOLSE-NEXT: cinc w14, w14, ne
-; NOLSE-NEXT: cbz w14, .LBB6_5
-; NOLSE-NEXT: // %bb.4: // %atomicrmw.start
-; NOLSE-NEXT: // in Loop: Header=BB6_3 Depth=2
-; NOLSE-NEXT: stlxp w14, x12, x13, [x19]
-; NOLSE-NEXT: cbnz w14, .LBB6_3
-; NOLSE-NEXT: b .LBB6_1
-; NOLSE-NEXT: .LBB6_5: // %atomicrmw.start
-; NOLSE-NEXT: // in Loop: Header=BB6_3 Depth=2
-; NOLSE-NEXT: stlxp w14, x9, x8, [x19]
-; NOLSE-NEXT: cbnz w14, .LBB6_3
-; NOLSE-NEXT: b .LBB6_1
-; NOLSE-NEXT: .LBB6_6: // %atomicrmw.end
-; NOLSE-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
-; NOLSE-NEXT: mov v0.16b, v1.16b
-; NOLSE-NEXT: add sp, sp, #96
+; NOLSE-NEXT: str q0, [sp, #32]
+; NOLSE-NEXT: ldp x9, x8, [sp, #32]
+; NOLSE-NEXT: stlxp w10, x9, x8, [x19]
+; NOLSE-NEXT: cbnz w10, .LBB6_1
+; NOLSE-NEXT: // %bb.2: // %atomicrmw.end
+; NOLSE-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload
+; NOLSE-NEXT: ldr q0, [sp] // 16-byte Folded Reload
+; NOLSE-NEXT: add sp, sp, #80
; NOLSE-NEXT: ret
;
; LSE-LABEL: test_atomicrmw_fadd_f32_seq_cst_align16:
@@ -463,35 +333,19 @@ define fp128 @test_atomicrmw_fadd_f32_seq_cst_align16(ptr %ptr, fp128 %value) #0
define <2 x half> @test_atomicrmw_fadd_v2f16_seq_cst_align4(ptr %ptr, <2 x half> %value) #0 {
; NOLSE-LABEL: test_atomicrmw_fadd_v2f16_seq_cst_align4:
; NOLSE: // %bb.0:
-; NOLSE-NEXT: fcvtl v1.4s, v0.4h
-; NOLSE-NEXT: ldr s0, [x0]
-; NOLSE-NEXT: b .LBB7_2
+; NOLSE-NEXT: fcvtl v0.4s, v0.4h
; NOLSE-NEXT: .LBB7_1: // %atomicrmw.start
-; NOLSE-NEXT: // in Loop: Header=BB7_2 Depth=1
-; NOLSE-NEXT: fmov s0, w10
-; NOLSE-NEXT: cmp w10, w9
-; NOLSE-NEXT: b.eq .LBB7_5
-; NOLSE-NEXT: .LBB7_2: // %atomicrmw.start
-; NOLSE-NEXT: // =>This Loop Header: Depth=1
-; NOLSE-NEXT: // Child Loop BB7_3 Depth 2
-; NOLSE-NEXT: fcvtl v2.4s, v0.4h
-; NOLSE-NEXT: fmov w9, s0
-; NOLSE-NEXT: fadd v2.4s, v2.4s, v1.4s
-; NOLSE-NEXT: fcvtn v2.4h, v2.4s
-; NOLSE-NEXT: fmov w8, s2
-; NOLSE-NEXT: .LBB7_3: // %atomicrmw.start
-; NOLSE-NEXT: // Parent Loop BB7_2 Depth=1
-; NOLSE-NEXT: // => This Inner Loop Header: Depth=2
-; NOLSE-NEXT: ldaxr w10, [x0]
-; NOLSE-NEXT: cmp w10, w9
-; NOLSE-NEXT: b.ne .LBB7_1
-; NOLSE-NEXT: // %bb.4: // %atomicrmw.start
-; NOLSE-NEXT: // in Loop: Header=BB7_3 Depth=2
-; NOLSE-NEXT: stlxr wzr, w8, [x0]
-; NOLSE-NEXT: cbnz wzr, .LBB7_3
-; NOLSE-NEXT: b .LBB7_1
-; NOLSE-NEXT: .LBB7_5: // %atomicrmw.end
-; NOLSE-NEXT: // kill: def $d0 killed $d0 killed $q0
+; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1
+; NOLSE-NEXT: ldaxr w8, [x0]
+; NOLSE-NEXT: fmov s1, w8
+; NOLSE-NEXT: fcvtl v1.4s, v1.4h
+; NOLSE-NEXT: fadd v1.4s, v1.4s, v0.4s
+; NOLSE-NEXT: fcvtn v1.4h, v1.4s
+; NOLSE-NEXT: fmov w9, s1
+; NOLSE-NEXT: stlxr w10, w9, [x0]
+; NOLSE-NEXT: cbnz w10, .LBB7_1
+; NOLSE-NEXT: // %bb.2: // %atomicrmw.end
+; NOLSE-NEXT: fmov d0, x8
; NOLSE-NEXT: ret
;
; LSE-LABEL: test_atomicrmw_fadd_v2f16_seq_cst_align4:
@@ -522,38 +376,22 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf
; NOLSE: // %bb.0:
; NOLSE-NEXT: movi v1.4s, #1
; NOLSE-NEXT: movi v2.4s, #127, msl #8
-; NOLSE-NEXT: shll v3.4s, v0.4h, #16
-; NOLSE-NEXT: ldr s0, [x0]
-; NOLSE-NEXT: b .LBB8_2
+; NOLSE-NEXT: shll v0.4s, v0.4h, #16
; NOLSE-NEXT: .LBB8_1: // %atomicrmw.start
-; NOLSE-NEXT: // in Loop: Header=BB8_2 Depth=1
-; NOLSE-NEXT: fmov s0, w10
-; NOLSE-NEXT: cmp w10, w9
-; NOLSE-NEXT: b.eq .LBB8_5
-; NOLSE-NEXT: .LBB8_2: // %atomicrmw.start
-; NOLSE-NEXT: // =>This Loop Header: Depth=1
-; NOLSE-NEXT: // Child Loop BB8_3 Depth 2
-; NOLSE-NEXT: shll v4.4s, v0.4h, #16
-; NOLSE-NEXT: fmov w9, s0
-; NOLSE-NEXT: fadd v4.4s, v4.4s, v3.4s
-; NOLSE-NEXT: ushr v5.4s, v4.4s, #16
-; NOLSE-NEXT: and v5.16b, v5.16b, v1.16b
-; NOLSE-NEXT: add v4.4s, v5.4s, v4.4s
-; NOLSE-NEXT: addhn v4.4h, v4.4s, v2.4s
-; NOLSE-NEXT: fmov w8, s4
-; NOLSE-NEXT: .LBB8_3: // %atomicrmw.start
-; NOLSE-NEXT: // Parent Loop BB8_2 Depth=1
-; NOLSE-NEXT: // => This Inner Loop Header: Depth=2
-; NOLSE-NEXT: ldaxr w10, [x0]
-; NOLSE-NEXT: cmp w10, w9
-; NOLSE-NEXT: b.ne .LBB8_1
-; NOLSE-NEXT: // %bb.4: // %atomicrmw.start
-; NOLSE-NEXT: // in Loop: Header=BB8_3 Depth=2
-; NOLSE-NEXT: stlxr wzr, w8, [x0]
-; NOLSE-NEXT: cbnz wzr, .LBB8_3
-; NOLSE-NEXT: b .LBB8_1
-; NOLSE-NEXT: .LBB8_5: // %atomicrmw.end
-; NOLSE-NEXT: // kill: def $d0 killed $d0 killed $q0
+; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1
+; NOLSE-NEXT: ldaxr w8, [x0]
+; NOLSE-NEXT: fmov s3, w8
+; NOLSE-NEXT: shll ...
[truncated]
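As a rough sketch of the LLSC fixups for the fp128 case (value names are illustrative, not actual generated output), emitLoadLinked/emitStoreConditional now widen through i128 around the paired exclusives:

  %lohi = call { i64, i64 } @llvm.aarch64.ldaxp(ptr %addr)
  %lo = extractvalue { i64, i64 } %lohi, 0
  %hi = extractvalue { i64, i64 } %lohi, 1
  %lo64 = zext i64 %lo to i128
  %hi64 = zext i64 %hi to i128
  %hi.shifted = shl i128 %hi64, 64
  %val64 = or i128 %lo64, %hi.shifted
  %val = bitcast i128 %val64 to fp128
  ; ... FP operation on %val produces %newval ...
  %new128 = bitcast fp128 %newval to i128
  %lo.new = trunc i128 %new128 to i64
  %hi.tmp = lshr i128 %new128, 64
  %hi.new = trunc i128 %hi.tmp to i64
  %status = call i32 @llvm.aarch64.stlxp(i64 %lo.new, i64 %hi.new, ptr %addr)

Reassembling into i128 first and only then bitcasting to the FP type keeps the shl/lshr/or arithmetic on an integer type, which is what the pre-existing code implicitly assumed.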
Please also fix the comment before AArch64TargetLowering::shouldExpandAtomicRMWInIR. (The comment is wrong; a floating-point "exception" isn't an exception in the sense that it would cancel an exclusive reservation.)
Just thought of this, but... we can't do this in the case where we do a libcall. Any load or store between the load exclusive and the store exclusive could break the reservation. (It normally won't, but it can in weird cases where the atomic variable is on the stack.) So we have to use cmpxchg lowering in those cases (and then expand the cmpxchg to ll/sc). The cases we can do inline should be fine, though.
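For illustration (hypothetical names, mirroring the fp128 test above), the problematic shape is an atomicrmw whose FP arithmetic is itself a libcall:

define fp128 @fadd_f128(ptr %p, fp128 %v) {
  %r = atomicrmw fadd ptr %p, fp128 %v seq_cst
  ret fp128 %r
}

Here the bl __addtf3 call and its surrounding stack traffic sit between the ldaxp and the stlxp, which is exactly the kind of intervening memory access the comment is about; the inline cases keep a single FP instruction between the exclusives.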
ping
case Type::DoubleTyID:
case Type::HalfTyID:
case Type::BFloatTyID:
  return false;
We need to check if we're in softfp mode (Subtarget->hasFPARMv8()).
Seems to not do anything?
aarch64 has fp enabled by default; you need something like -mattr=-fp-armv8 to test.
(I'm confident Subtarget->hasFPARMv8() is the right check; it's the same check used to enable the registers.)
That is in the test (in the parent #103701)
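(For reference, a test along those lines would presumably use a RUN line roughly like the following; the exact flags depend on the parent PR's test setup:

; RUN: llc -mtriple=aarch64 -mattr=-fp-armv8 < %s | FileCheck %s
)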
LGTM
Use LLSC or cmpxchg in the same cases as for the unsupported
integer operations. This required some fixups to the LLSC
implementation to deal with the fp128 case.

The comment about floating-point exceptions was wrong: a
floating-point "exception" is not an exception in the sense
that it would cancel an exclusive reservation.